Bank Marketing Data Set

Context

The data is related with direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required, in order to access if the product (bank term deposit) would be (‘yes’) or not (‘no’) subscribed. The classification goal is to predict if the client will subscribe (yes/no) a term deposit (variable y).

Objective 1: Display the ability to perform EDA and build a logistic regression model

# Reading datafile
        bank.additional.full <- read_delim("../Data/bank-additional-full.csv", delim = ";")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   age = col_double(),
##   duration = col_double(),
##   campaign = col_double(),
##   pdays = col_double(),
##   previous = col_double(),
##   emp.var.rate = col_double(),
##   cons.price.idx = col_double(),
##   cons.conf.idx = col_double(),
##   euribor3m = col_double(),
##   nr.employed = col_double()
## )
## See spec(...) for full column specifications.
#Missing data? 
        vis_miss(bank.additional.full)

To ensure performance metrics are comparable across models, setting the training and test sets here in the beginning.

## TRANSFORMING DATA (Suchi)

#Converting the categorical variables as factors
        bank.additional.full$y <- as.factor(bank.additional.full$y)
        bank.additional.full$job <- as.factor(bank.additional.full$job)
        bank.additional.full$marital <- as.factor(bank.additional.full$marital)
        bank.additional.full$education <- as.factor(bank.additional.full$education)
        bank.additional.full$default <- as.factor(bank.additional.full$default)
        bank.additional.full$housing <- as.factor(bank.additional.full$housing)
        bank.additional.full$loan <- as.factor(bank.additional.full$loan)
        bank.additional.full$contact <- as.factor(bank.additional.full$contact)
        bank.additional.full$day_of_week <- as.factor(bank.additional.full$day_of_week)
        bank.additional.full$poutcome <- as.factor(bank.additional.full$poutcome)
        bank.additional.full$month <- as.factor(bank.additional.full$month)

#Removing records from bank.additional.full where value of default = 'yes' as it might cause issue when picked in non-balance way.
        bank.additional.full <- bank.additional.full %>% filter (default!='yes')    

# Separating out yes and no observations
        bank.additional.no <- bank.additional.full %>% filter (y=='no')
        bank.additional.yes <- bank.additional.full %>% filter (y=='yes')

 # Picking 1000 sample each from yes and no
        set.seed(1234)
        index.no<-sample(1:nrow(bank.additional.no),2000,replace=FALSE)
        index.yes<-sample(1:nrow(bank.additional.yes),2000,replace=FALSE)

        bank.additional.sample.no<-bank.additional.no[index.no,]
        bank.additional.sample.yes<-bank.additional.yes[index.yes,]
        
        bank.additional.sample <- rbind(bank.additional.sample.no,bank.additional.sample.yes)
        
#Splitting train and test data set
        set.seed(1234)
        index<-sample(1:4000,3000,replace=FALSE)
        bank.additional.sample <- as.data.frame(bank.additional.sample)
        bank.additional.sample.train<-bank.additional.sample[index,]
        bank.additional.sample.train <- as.data.frame(bank.additional.sample.train)
        bank.additional.sample.test <-bank.additional.sample[-index,]
        bank.additional.sample.test <- as.data.frame(bank.additional.sample.test)
#summary of dataset (Hollie)
        summary(bank.additional.full)
##       age                 job            marital     
##  Min.   :17.00   admin.     :10422   divorced: 4612  
##  1st Qu.:32.00   blue-collar: 9254   married :24925  
##  Median :38.00   technician : 6741   single  :11568  
##  Mean   :40.02   services   : 3969   unknown :   80  
##  3rd Qu.:47.00   management : 2924                   
##  Max.   :98.00   retired    : 1720                   
##                  (Other)    : 6155                   
##                education        default         housing           loan      
##  university.degree  :12168   no     :32588   no     :18620   no     :33947  
##  high.school        : 9514   unknown: 8597   unknown:  990   unknown:  990  
##  basic.9y           : 6045   yes    :    0   yes    :21575   yes    : 6248  
##  professional.course: 5241                                                  
##  basic.4y           : 4176                                                  
##  basic.6y           : 2292                                                  
##  (Other)            : 1749                                                  
##       contact          month       day_of_week    duration     
##  cellular :26141   may    :13769   fri:7827    Min.   :   0.0  
##  telephone:15044   jul    : 7174   mon:8514    1st Qu.: 102.0  
##                    aug    : 6176   thu:8623    Median : 180.0  
##                    jun    : 5318   tue:8087    Mean   : 258.3  
##                    nov    : 4100   wed:8134    3rd Qu.: 319.0  
##                    apr    : 2632               Max.   :4918.0  
##                    (Other): 2016                               
##     campaign          pdays          previous            poutcome    
##  Min.   : 1.000   Min.   :  0.0   Min.   :0.000   failure    : 4251  
##  1st Qu.: 1.000   1st Qu.:999.0   1st Qu.:0.000   nonexistent:35561  
##  Median : 2.000   Median :999.0   Median :0.000   success    : 1373  
##  Mean   : 2.568   Mean   :962.5   Mean   :0.173                      
##  3rd Qu.: 3.000   3rd Qu.:999.0   3rd Qu.:0.000                      
##  Max.   :56.000   Max.   :999.0   Max.   :7.000                      
##                                                                      
##   emp.var.rate      cons.price.idx  cons.conf.idx     euribor3m    
##  Min.   :-3.40000   Min.   :92.20   Min.   :-50.8   Min.   :0.634  
##  1st Qu.:-1.80000   1st Qu.:93.08   1st Qu.:-42.7   1st Qu.:1.344  
##  Median : 1.10000   Median :93.75   Median :-41.8   Median :4.857  
##  Mean   : 0.08183   Mean   :93.58   Mean   :-40.5   Mean   :3.621  
##  3rd Qu.: 1.40000   3rd Qu.:93.99   3rd Qu.:-36.4   3rd Qu.:4.961  
##  Max.   : 1.40000   Max.   :94.77   Max.   :-26.9   Max.   :5.045  
##                                                                    
##   nr.employed     y        
##  Min.   :4964   no :36545  
##  1st Qu.:5099   yes: 4640  
##  Median :5191              
##  Mean   :5167              
##  3rd Qu.:5228              
##  Max.   :5228              
## 
        summary(bank.additional.sample)
##       age                 job           marital                   education   
##  Min.   :17.00   admin.     :1038   divorced: 466   university.degree  :1293  
##  1st Qu.:32.00   blue-collar: 773   married :2327   high.school        : 893  
##  Median :38.00   technician : 653   single  :1199   professional.course: 511  
##  Mean   :40.64   services   : 335   unknown :   8   basic.9y           : 496  
##  3rd Qu.:48.00   management : 293                   basic.4y           : 418  
##  Max.   :98.00   retired    : 261                   basic.6y           : 202  
##                  (Other)    : 647                   (Other)            : 187  
##     default        housing          loan           contact         month     
##  no     :3362   no     :1754   no     :3287   cellular :2862   may    :1070  
##  unknown: 638   unknown: 100   unknown: 100   telephone:1138   jul    : 634  
##  yes    :   0   yes    :2146   yes    : 613                    aug    : 584  
##                                                                jun    : 499  
##                                                                nov    : 396  
##                                                                apr    : 348  
##                                                                (Other): 469  
##  day_of_week    duration         campaign          pdays        previous     
##  fri:717     Min.   :   5.0   Min.   : 1.000   Min.   :  0   Min.   :0.0000  
##  mon:789     1st Qu.: 150.0   1st Qu.: 1.000   1st Qu.:999   1st Qu.:0.0000  
##  thu:846     Median : 271.0   Median : 2.000   Median :999   Median :0.0000  
##  tue:808     Mean   : 397.9   Mean   : 2.313   Mean   :893   Mean   :0.3147  
##  wed:840     3rd Qu.: 538.2   3rd Qu.: 3.000   3rd Qu.:999   3rd Qu.:0.0000  
##              Max.   :3509.0   Max.   :37.000   Max.   :999   Max.   :6.0000  
##                                                                              
##         poutcome     emp.var.rate     cons.price.idx  cons.conf.idx   
##  failure    : 486   Min.   :-3.4000   Min.   :92.20   Min.   :-50.80  
##  nonexistent:3121   1st Qu.:-1.8000   1st Qu.:92.89   1st Qu.:-42.70  
##  success    : 393   Median :-0.1000   Median :93.44   Median :-41.80  
##                     Mean   :-0.4606   Mean   :93.49   Mean   :-40.29  
##                     3rd Qu.: 1.4000   3rd Qu.:93.99   3rd Qu.:-36.40  
##                     Max.   : 1.4000   Max.   :94.77   Max.   :-26.90  
##                                                                       
##    euribor3m      nr.employed     y       
##  Min.   :0.634   Min.   :4964   no :2000  
##  1st Qu.:1.250   1st Qu.:5076   yes:2000  
##  Median :4.021   Median :5191             
##  Mean   :2.996   Mean   :5137             
##  3rd Qu.:4.959   3rd Qu.:5228             
##  Max.   :5.045   Max.   :5228             
## 
        summary(bank.additional.sample.train)
##       age                 job          marital                   education  
##  Min.   :17.00   admin.     :781   divorced: 335   university.degree  :970  
##  1st Qu.:31.00   blue-collar:570   married :1737   high.school        :685  
##  Median :38.00   technician :499   single  : 921   professional.course:380  
##  Mean   :40.61   services   :249   unknown :   7   basic.9y           :356  
##  3rd Qu.:48.00   management :222                   basic.4y           :306  
##  Max.   :98.00   retired    :199                   basic.6y           :150  
##                  (Other)    :480                   (Other)            :153  
##     default        housing          loan           contact         month    
##  no     :2520   no     :1297   no     :2467   cellular :2151   may    :808  
##  unknown: 480   unknown:  75   unknown:  75   telephone: 849   jul    :475  
##  yes    :   0   yes    :1628   yes    : 458                    aug    :439  
##                                                                jun    :362  
##                                                                nov    :300  
##                                                                apr    :262  
##                                                                (Other):354  
##  day_of_week    duration         campaign          pdays          previous     
##  fri:554     Min.   :   5.0   Min.   : 1.000   Min.   :  0.0   Min.   :0.0000  
##  mon:570     1st Qu.: 147.0   1st Qu.: 1.000   1st Qu.:999.0   1st Qu.:0.0000  
##  thu:642     Median : 268.5   Median : 2.000   Median :999.0   Median :0.0000  
##  tue:606     Mean   : 391.9   Mean   : 2.328   Mean   :894.1   Mean   :0.3187  
##  wed:628     3rd Qu.: 524.2   3rd Qu.: 3.000   3rd Qu.:999.0   3rd Qu.:0.0000  
##              Max.   :3509.0   Max.   :37.000   Max.   :999.0   Max.   :6.0000  
##                                                                                
##         poutcome     emp.var.rate     cons.price.idx  cons.conf.idx   
##  failure    : 366   Min.   :-3.4000   Min.   :92.20   Min.   :-50.80  
##  nonexistent:2339   1st Qu.:-1.8000   1st Qu.:92.89   1st Qu.:-42.70  
##  success    : 295   Median :-0.1000   Median :93.44   Median :-41.80  
##                     Mean   :-0.4622   Mean   :93.49   Mean   :-40.27  
##                     3rd Qu.: 1.4000   3rd Qu.:93.99   3rd Qu.:-36.40  
##                     Max.   : 1.4000   Max.   :94.77   Max.   :-26.90  
##                                                                       
##    euribor3m      nr.employed     y       
##  Min.   :0.634   Min.   :4964   no :1513  
##  1st Qu.:1.244   1st Qu.:5076   yes:1487  
##  Median :4.076   Median :5191             
##  Mean   :2.992   Mean   :5136             
##  3rd Qu.:4.959   3rd Qu.:5228             
##  Max.   :5.045   Max.   :5228             
## 
#boxplot of response variable (y) versus all other continous variables (Suchi)
        bank.additional.sample %>% ggplot(aes(x = y, y=age)) + geom_boxplot(fill="red") + labs(title = "Term Deposits Success by Age") + ylab("Age")

        bank.additional.sample %>% ggplot(aes(x = y, y=duration)) + geom_boxplot(fill="red") + labs(title = "Term Deposits Success by duration") + ylab("duration")

        bank.additional.sample %>% ggplot(aes(x = y, y=campaign)) + geom_boxplot(fill="red") + labs(title = "Term Deposits Success by campaign") + ylab("campaign")

        bank.additional.sample %>% ggplot(aes(x = y, y=pdays)) + geom_boxplot(fill="red") + labs(title = "Term Deposits Success") + ylab("Days after last contact")

        bank.additional.sample %>% ggplot(aes(x = y, y=previous)) + geom_boxplot(fill="red") + labs(title = "Term Deposits Success ") + ylab("Number of contacts done before")

        bank.additional.sample %>% ggplot(aes(x = y, y=emp.var.rate  )) + geom_boxplot(fill="red") + labs(title = "Term Deposits Success ") + ylab("employment variation rate - quarterly indicator")

        bank.additional.sample %>% ggplot(aes(x = y, y=cons.price.idx)) + geom_boxplot(fill="red") + labs(title = "Term Deposits Success ") + ylab("consumer price index - monthly indicator")

        bank.additional.sample %>% ggplot(aes(x = y, y=cons.conf.idx)) + geom_boxplot(fill="red") + labs(title = "Term Deposits Success ") + ylab("consumer confidence index - monthly indicator")

        bank.additional.sample %>% ggplot(aes(x = y, y=euribor3m)) + geom_boxplot(fill="red") + labs(title = "Term Deposits Success") + ylab("euribor 3 month rate - daily indicator")

        bank.additional.sample %>% ggplot(aes(x = y, y=nr.employed)) + geom_boxplot(fill="red") + labs(title = "Term Deposits Success ") + ylab("number of employees - quarterly indicator")

#Visualize Categorical variables (Suchi)
#Table of counts are helpful for categorical variables
        attach(bank.additional.sample)
## The following object is masked from package:MASS:
## 
##     housing
        ftable(addmargins(table(y,job))) 
##     job admin. blue-collar entrepreneur housemaid management retired self-employed services student technician unemployed unknown  Sum
## y                                                                                                                                     
## no         472         473           74        51        153      67            68      189      38        351         46      18 2000
## yes        566         300           56        45        140     194            69      146     100        302         64      18 2000
## Sum       1038         773          130        96        293     261           137      335     138        653        110      36 4000
        ftable(addmargins(table(y,marital)))
##     marital divorced married single unknown  Sum
## y                                               
## no               249    1196    552       3 2000
## yes              217    1131    647       5 2000
## Sum              466    2327   1199       8 4000
        ftable(addmargins(table(y,education))) 
##     education basic.4y basic.6y basic.9y high.school illiterate professional.course university.degree unknown  Sum
## y                                                                                                                 
## no                 203      127      302         456          0                 249               587      76 2000
## yes                215       75      194         437          3                 262               706     108 2000
## Sum                418      202      496         893          3                 511              1293     184 4000
        ftable(addmargins(table(y,default))) 
##     default   no unknown  yes  Sum
## y                                 
## no          1566     434    0 2000
## yes         1796     204    0 2000
## Sum         3362     638    0 4000
        ftable(addmargins(table(y,housing))) 
##     housing   no unknown  yes  Sum
## y                                 
## no           876      58 1066 2000
## yes          878      42 1080 2000
## Sum         1754     100 2146 4000
        ftable(addmargins(table(y,loan))) 
##     loan   no unknown  yes  Sum
## y                              
## no       1634      58  308 2000
## yes      1653      42  305 2000
## Sum      3287     100  613 4000
        ftable(addmargins(table(y,contact))) 
##     contact cellular telephone  Sum
## y                                  
## no              1224       776 2000
## yes             1638       362 2000
## Sum             2862      1138 4000
        ftable(addmargins(table(y,month))) 
##     month  apr  aug  dec  jul  jun  mar  may  nov  oct  sep  Sum
## y                                                               
## no         114  300    6  365  268   20  686  205   23   13 2000
## yes        234  284   32  269  231  120  384  191  139  116 2000
## Sum        348  584   38  634  499  140 1070  396  162  129 4000
        ftable(addmargins(table(y,day_of_week))) 
##     day_of_week  fri  mon  thu  tue  wed  Sum
## y                                            
## no               367  450  397  366  420 2000
## yes              350  339  449  442  420 2000
## Sum              717  789  846  808  840 4000
        ftable(addmargins(table(y,poutcome))) 
##     poutcome failure nonexistent success  Sum
## y                                            
## no               221        1755      24 2000
## yes              265        1366     369 2000
## Sum              486        3121     393 4000
#to get proportions that make sense
        prop.table(table(y,job),2)
##      job
## y        admin. blue-collar entrepreneur housemaid management   retired
##   no  0.4547206   0.6119017    0.5692308 0.5312500  0.5221843 0.2567050
##   yes 0.5452794   0.3880983    0.4307692 0.4687500  0.4778157 0.7432950
##      job
## y     self-employed  services   student technician unemployed   unknown
##   no      0.4963504 0.5641791 0.2753623  0.5375191  0.4181818 0.5000000
##   yes     0.5036496 0.4358209 0.7246377  0.4624809  0.5818182 0.5000000
        plot(y~job,col=c("red","blue"))

        prop.table(table(y,marital),2)
##      marital
## y      divorced   married    single   unknown
##   no  0.5343348 0.5139665 0.4603837 0.3750000
##   yes 0.4656652 0.4860335 0.5396163 0.6250000
        plot(y~marital,col=c("red","blue"))

        prop.table(table(y,education),2)
##      education
## y      basic.4y  basic.6y  basic.9y high.school illiterate professional.course
##   no  0.4856459 0.6287129 0.6088710   0.5106383  0.0000000           0.4872798
##   yes 0.5143541 0.3712871 0.3911290   0.4893617  1.0000000           0.5127202
##      education
## y     university.degree   unknown
##   no          0.4539830 0.4130435
##   yes         0.5460170 0.5869565
        plot(y~education,col=c("red","blue"))

        prop.table(table(y,default),2)
##      default
## y            no   unknown yes
##   no  0.4657942 0.6802508    
##   yes 0.5342058 0.3197492
        plot(y~default,col=c("red","blue"))

        prop.table(table(y,housing),2)
##      housing
## y            no   unknown       yes
##   no  0.4994299 0.5800000 0.4967381
##   yes 0.5005701 0.4200000 0.5032619
        plot(y~housing,col=c("red","blue"))

        prop.table(table(y,loan),2)
##      loan
## y            no   unknown       yes
##   no  0.4971098 0.5800000 0.5024470
##   yes 0.5028902 0.4200000 0.4975530
        plot(y~loan,col=c("red","blue"))

        prop.table(table(y,contact),2)
##      contact
## y      cellular telephone
##   no  0.4276730 0.6818981
##   yes 0.5723270 0.3181019
        plot(y~contact,col=c("red","blue"))

        prop.table(table(y,month),2)
##      month
## y           apr       aug       dec       jul       jun       mar       may
##   no  0.3275862 0.5136986 0.1578947 0.5757098 0.5370741 0.1428571 0.6411215
##   yes 0.6724138 0.4863014 0.8421053 0.4242902 0.4629259 0.8571429 0.3588785
##      month
## y           nov       oct       sep
##   no  0.5176768 0.1419753 0.1007752
##   yes 0.4823232 0.8580247 0.8992248
        plot(y~month,col=c("red","blue"))

        prop.table(table(y,day_of_week),2)
##      day_of_week
## y           fri       mon       thu       tue       wed
##   no  0.5118550 0.5703422 0.4692671 0.4529703 0.5000000
##   yes 0.4881450 0.4296578 0.5307329 0.5470297 0.5000000
        plot(y~day_of_week,col=c("red","blue"))

        prop.table(table(y,poutcome),2)
##      poutcome
## y       failure nonexistent   success
##   no  0.4547325   0.5623198 0.0610687
##   yes 0.5452675   0.4376802 0.9389313
        plot(y~poutcome,col=c("red","blue"))

Drawing corr plot to check collinearity between continous variables. However, for few, was not very clear if there is linear relationship. Putting additional scatter plots for those.

#Logistics Regression Assumption Check (Suchi)
#Overall EDA to check collinearity between continous variables

        ggpairs(bank.additional.full,columns=c(1,11:14,16:20),aes(colour=y))

#        ggpairs(bank.additional.sample,columns=c(1,11:14,16:20),aes(colour=y))

      ggplot(bank.additional.full, aes(x = emp.var.rate , y = cons.price.idx)) +  geom_point()  + xlab("employment variation rate - quarterly indicator") + ylab("consumer price index") +  ggtitle("employment variation rate vs consumer price index")  

      ggplot(bank.additional.full, aes(x = emp.var.rate , y = cons.conf.idx)) +  geom_point()  + xlab("employment variation rate - quarterly indicator") + ylab("consumer confidence index") + ggtitle("employment variation rate vs consumer confidence index")   

      ggplot(bank.additional.full, aes(x = emp.var.rate , y = nr.employed)) +  geom_point()  + xlab("employment variation rate - quarterly indicator") + ylab("number of employees")+  ggtitle("employment variation rate vs number of employees")  

      ggplot(bank.additional.full, aes(x = cons.price.idx , y = cons.conf.idx)) +  geom_point()  + xlab("consumer price index") + ylab("consumer confidence index") +  ggtitle("Relation between consumer price index and consumer confidence index")  

      ggplot(bank.additional.full, aes(x = cons.price.idx , y = nr.employed)) +  geom_point()  + xlab("consumer price index") + ylab("number of employees") +  ggtitle("Relation between consumer price index and number of employees")  

      ggplot(bank.additional.full, aes(x = cons.conf.idx , y = nr.employed)) +  geom_point()  + xlab("consumer confidence index") + ylab("number of employees") +  ggtitle("Relation between consumer confidence index and number of employees")  

Perform your logistic regression analysis and provide interpretation of the regression coefficients including hypothesis testing, and confidence intervals. For simplicity sake, you do not need to include interactions with this model. Comment on the practical vs statistical significance of the deemed important factors.

# Checking significance of individual variables in predicting outcome (Suchi)
      summary(glm(y~age , family="binomial",data=bank.additional.sample.train))        ##From boxplot, it did not look significant but from glm, it does look OK.
## 
## Call:
## glm(formula = y ~ age, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.355  -1.160  -1.100   1.199   1.276  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -0.381256   0.126289  -3.019  0.00254 **
## age          0.008963   0.002978   3.009  0.00262 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 4149.5  on 2998  degrees of freedom
## AIC: 4153.5
## 
## Number of Fisher Scoring iterations: 3
      summary(glm(y~job , family="binomial",data=bank.additional.sample.train))
## 
## Call:
## glm(formula = y ~ job, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.622  -1.102  -1.002   1.206   1.363  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       0.13593    0.07173   1.895 0.058088 .  
## jobblue-collar   -0.56338    0.11175  -5.041 4.62e-07 ***
## jobentrepreneur  -0.54140    0.21636  -2.502 0.012339 *  
## jobhousemaid     -0.10926    0.24184  -0.452 0.651415    
## jobmanagement    -0.33478    0.15278  -2.191 0.028433 *  
## jobretired        0.85182    0.17482   4.873 1.10e-06 ***
## jobself-employed -0.20337    0.22392  -0.908 0.363751    
## jobservices      -0.33741    0.14620  -2.308 0.021001 *  
## jobstudent        0.86622    0.22866   3.788 0.000152 ***
## jobtechnician    -0.31678    0.11501  -2.754 0.005879 ** 
## jobunemployed     0.23876    0.23724   1.006 0.314213    
## jobunknown       -0.21004    0.39179  -0.536 0.591884    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 4046.8  on 2988  degrees of freedom
## AIC: 4070.8
## 
## Number of Fisher Scoring iterations: 4
      summary(glm(y~marital , family="binomial",data=bank.additional.sample.train))    ## Not at all
## 
## Call:
## glm(formula = y ~ marital, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.302  -1.152  -1.095   1.203   1.262  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)  
## (Intercept)     -0.1977     0.1098  -1.800   0.0719 .
## maritalmarried   0.1366     0.1198   1.140   0.2543  
## maritalsingle    0.3260     0.1281   2.544   0.0110 *
## maritalunknown   0.4853     0.7716   0.629   0.5293  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 4150.1  on 2996  degrees of freedom
## AIC: 4158.1
## 
## Number of Fisher Scoring iterations: 3
      summary(glm(y~education , family="binomial",data=bank.additional.sample.train))  ## Meh
## 
## Call:
## glm(formula = y ~ education, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2933  -1.1774  -0.9448   1.1333   1.4294  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    0.10467    0.11449   0.914 0.360590    
## educationbasic.6y             -0.68003    0.20504  -3.317 0.000911 ***
## educationbasic.9y             -0.57376    0.15803  -3.631 0.000283 ***
## educationhigh.school          -0.16600    0.13767  -1.206 0.227886    
## educationilliterate           12.46139  187.49090   0.066 0.947008    
## educationprofessional.course  -0.10467    0.15373  -0.681 0.495962    
## educationuniversity.degree     0.06065    0.13138   0.462 0.644313    
## educationunknown               0.16359    0.20064   0.815 0.414871    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 4113.0  on 2992  degrees of freedom
## AIC: 4129
## 
## Number of Fisher Scoring iterations: 11
      summary(glm(y~default , family="binomial",data=bank.additional.sample.train))    ## Okay, will get back to you!!!!
## 
## Call:
## glm(formula = y ~ default, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2298  -1.2298  -0.8657   1.1259   1.5252  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     0.12237    0.03992   3.066  0.00217 ** 
## defaultunknown -0.91083    0.10626  -8.572  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 4080.3  on 2998  degrees of freedom
## AIC: 4084.3
## 
## Number of Fisher Scoring iterations: 4
      summary(glm(y~housing , family="binomial",data=bank.additional.sample.train))    ## Not at all
## 
## Call:
## glm(formula = y ~ housing, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.176  -1.176  -1.033   1.178   1.329  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)
## (Intercept)    -0.01696    0.05554  -0.305    0.760
## housingunknown -0.33324    0.24097  -1.383    0.167
## housingyes      0.01451    0.07444   0.195    0.845
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 4156.5  on 2997  degrees of freedom
## AIC: 4162.5
## 
## Number of Fisher Scoring iterations: 3
      summary(glm(y~loan , family="binomial",data=bank.additional.sample.train))       ## Not at all
## 
## Call:
## glm(formula = y ~ loan, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.175  -1.175  -1.033   1.180   1.329  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.005675   0.040267  -0.141    0.888
## loanunknown -0.344527   0.237912  -1.448    0.148
## loanyes     -0.020527   0.101767  -0.202    0.840
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 4156.5  on 2997  degrees of freedom
## AIC: 4162.5
## 
## Number of Fisher Scoring iterations: 3
      summary(glm(y~contact , family="binomial",data=bank.additional.sample.train))
## 
## Call:
## glm(formula = y ~ contact, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.295  -1.295  -0.867   1.064   1.524  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       0.27224    0.04352   6.255 3.97e-10 ***
## contacttelephone -1.05693    0.08584 -12.312  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 3998.1  on 2998  degrees of freedom
## AIC: 4002.1
## 
## Number of Fisher Scoring iterations: 4
      summary(glm(y~month , family="binomial",data=bank.additional.sample.train))
## 
## Call:
## glm(formula = y ~ month, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1710  -1.0212  -0.9368   1.2247   1.4388  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.7510     0.1324   5.673 1.40e-08 ***
## monthaug     -0.8285     0.1632  -5.075 3.87e-07 ***
## monthdec      0.7531     0.4703   1.601 0.109337    
## monthjul     -1.1302     0.1620  -6.976 3.05e-12 ***
## monthjun     -0.8616     0.1691  -5.094 3.50e-07 ***
## monthmar      1.0408     0.3087   3.371 0.000748 ***
## monthmay     -1.3472     0.1514  -8.898  < 2e-16 ***
## monthnov     -0.8711     0.1758  -4.955 7.22e-07 ***
## monthoct      0.9934     0.2877   3.453 0.000555 ***
## monthsep      1.5062     0.3745   4.022 5.77e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 3824.0  on 2990  degrees of freedom
## AIC: 3844
## 
## Number of Fisher Scoring iterations: 4
      summary(glm(y~day_of_week , family="binomial",data=bank.additional.sample.train))## Not at all
## 
## Call:
## glm(formula = y ~ day_of_week, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.257  -1.159  -1.031   1.196   1.331  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)  
## (Intercept)    -0.050552   0.084999  -0.595   0.5520  
## day_of_weekmon -0.303993   0.120269  -2.528   0.0115 *
## day_of_weekthu  0.194100   0.116136   1.671   0.0947 .
## day_of_weektue  0.235900   0.117823   2.002   0.0453 *
## day_of_weekwed  0.005959   0.116608   0.051   0.9592  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 4132.1  on 2995  degrees of freedom
## AIC: 4142.1
## 
## Number of Fisher Scoring iterations: 3
      summary(glm(y~duration , family="binomial",data=bank.additional.sample.train))   ## This is not a true predictor
## 
## Call:
## glm(formula = y ~ duration, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.4693  -0.8492  -0.6090   0.9577   1.7715  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.6334204  0.0758929  -21.52   <2e-16 ***
## duration     0.0047279  0.0002114   22.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 3263.2  on 2998  degrees of freedom
## AIC: 3267.2
## 
## Number of Fisher Scoring iterations: 5
      summary(glm(y~campaign , family="binomial",data=bank.additional.sample.train))
## 
## Call:
## glm(formula = y ~ campaign, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2313  -1.1831  -0.6014   1.1244   2.2098  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.23844    0.05485   4.347 1.38e-05 ***
## campaign    -0.11257    0.01854  -6.071 1.27e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 4114.0  on 2998  degrees of freedom
## AIC: 4118
## 
## Number of Fisher Scoring iterations: 3
      summary(glm(y~pdays , family="binomial",data=bank.additional.sample.train))
## 
## Call:
## glm(formula = y ~ pdays, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.377  -1.082  -1.082   1.276   1.276  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.7706092  0.2380557   11.64   <2e-16 ***
## pdays       -0.0030020  0.0002415  -12.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 3828.4  on 2998  degrees of freedom
## AIC: 3832.4
## 
## Number of Fisher Scoring iterations: 5
      summary(glm(y~previous , family="binomial",data=bank.additional.sample.train))
## 
## Call:
## glm(formula = y ~ previous, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.332  -1.060  -1.060   1.300   1.300  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.28364    0.04116  -6.891 5.55e-12 ***
## previous     0.97806    0.07613  12.848  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 3926.0  on 2998  degrees of freedom
## AIC: 3930
## 
## Number of Fisher Scoring iterations: 4
      summary(glm(y~poutcome , family="binomial",data=bank.additional.sample.train))
## 
## Call:
## glm(formula = y ~ poutcome, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.389  -1.060  -1.060   1.299   1.299  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           0.2195     0.1052   2.087   0.0369 *  
## poutcomenonexistent  -0.5009     0.1132  -4.427 9.58e-06 ***
## poutcomesuccess       2.5750     0.2711   9.499  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 3829.7  on 2997  degrees of freedom
## AIC: 3835.7
## 
## Number of Fisher Scoring iterations: 5
      summary(glm(y~emp.var.rate , family="binomial",data=bank.additional.sample.train))
## 
## Call:
## glm(formula = y ~ emp.var.rate, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9118  -0.8277  -0.7697   0.8810   1.6500  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.27253    0.04198  -6.492 8.49e-11 ***
## emp.var.rate -0.56606    0.02493 -22.710  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 3557.8  on 2998  degrees of freedom
## AIC: 3561.8
## 
## Number of Fisher Scoring iterations: 4
      summary(glm(y~cons.price.idx , family="binomial",data=bank.additional.sample.train))
## 
## Call:
## glm(formula = y ~ cons.price.idx, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5416  -1.0536  -0.9142   1.1056   1.5547  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    61.13741    5.65717   10.81   <2e-16 ***
## cons.price.idx -0.65414    0.06051  -10.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 4036.8  on 2998  degrees of freedom
## AIC: 4040.8
## 
## Number of Fisher Scoring iterations: 4
      summary(glm(y~cons.conf.idx , family="binomial",data=bank.additional.sample.train))
## 
## Call:
## glm(formula = y ~ cons.conf.idx, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.363  -1.149  -1.037   1.185   1.336  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.317221   0.279983   4.705 2.54e-06 ***
## cons.conf.idx 0.033144   0.006893   4.809 1.52e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 4135.3  on 2998  degrees of freedom
## AIC: 4139.3
## 
## Number of Fisher Scoring iterations: 3
      summary(glm(y~euribor3m , family="binomial",data=bank.additional.sample.train))
## 
## Call:
## glm(formula = y ~ euribor3m, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7086  -0.7992  -0.7800   0.8364   1.6539  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.52188    0.07714   19.73   <2e-16 ***
## euribor3m   -0.51450    0.02183  -23.57   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 3531.1  on 2998  degrees of freedom
## AIC: 3535.1
## 
## Number of Fisher Scoring iterations: 4
      summary(glm(y~nr.employed , family="binomial",data=bank.additional.sample.train))
## 
## Call:
## glm(formula = y ~ nr.employed, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1351  -0.8903  -0.7504   0.9801   1.6762  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 63.9997285  2.7051431   23.66   <2e-16 ***
## nr.employed -0.0124563  0.0005258  -23.69   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 3455.0  on 2998  degrees of freedom
## AIC: 3459
## 
## Number of Fisher Scoring iterations: 4
#Suchi
#Type of Selection (Manual / Intuition)
#fitting a simple model by using only those predictors which appeared to be significant in EDA and individual variable check -- ##Kind of Manual backward selection
  
        simple.logistics.all.significant<-glm(y~ job + contact + month + campaign + pdays + previous + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + euribor3m + nr.employed, family="binomial",data=bank.additional.sample.train)
        summary(simple.logistics.all.significant)
## 
## Call:
## glm(formula = y ~ job + contact + month + campaign + pdays + 
##     previous + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + 
##     euribor3m + nr.employed, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7513  -0.8615  -0.6019   0.8025   1.9204  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -3.320e+02  8.705e+01  -3.814 0.000137 ***
## jobblue-collar      -3.822e-03  1.307e-01  -0.029 0.976663    
## jobentrepreneur      3.588e-02  2.469e-01   0.145 0.884460    
## jobhousemaid         7.902e-02  2.861e-01   0.276 0.782418    
## jobmanagement       -8.257e-02  1.811e-01  -0.456 0.648444    
## jobretired           1.159e-01  2.127e-01   0.545 0.585903    
## jobself-employed     1.419e-01  2.606e-01   0.545 0.586016    
## jobservices          1.021e-01  1.678e-01   0.608 0.543096    
## jobstudent           1.204e-01  2.710e-01   0.444 0.656800    
## jobtechnician       -1.055e-01  1.371e-01  -0.770 0.441590    
## jobunemployed        2.671e-01  2.839e-01   0.941 0.346835    
## jobunknown          -2.010e-01  4.852e-01  -0.414 0.678753    
## contacttelephone    -6.506e-01  1.684e-01  -3.864 0.000112 ***
## monthaug             7.814e-01  3.313e-01   2.358 0.018353 *  
## monthdec            -3.174e-01  5.342e-01  -0.594 0.552382    
## monthjul             8.123e-02  2.197e-01   0.370 0.711543    
## monthjun            -1.077e+00  3.052e-01  -3.528 0.000418 ***
## monthmar             1.580e+00  3.764e-01   4.197 2.71e-05 ***
## monthmay            -3.981e-01  1.880e-01  -2.117 0.034230 *  
## monthnov            -3.713e-01  2.658e-01  -1.397 0.162413    
## monthoct             1.584e-01  3.669e-01   0.432 0.666055    
## monthsep             9.180e-01  4.913e-01   1.869 0.061670 .  
## campaign            -2.066e-02  1.842e-02  -1.122 0.262045    
## pdays               -1.036e-03  8.363e-04  -1.239 0.215338    
## previous             8.733e-02  2.103e-01   0.415 0.677964    
## poutcomenonexistent  5.814e-01  2.758e-01   2.108 0.035037 *  
## poutcomesuccess      8.731e-01  8.334e-01   1.048 0.294771    
## emp.var.rate        -1.964e+00  3.420e-01  -5.743 9.29e-09 ***
## cons.price.idx       2.726e+00  5.864e-01   4.649 3.33e-06 ***
## cons.conf.idx        5.600e-02  2.076e-02   2.698 0.006974 ** 
## euribor3m            9.084e-02  3.059e-01   0.297 0.766502    
## nr.employed          1.538e-02  6.945e-03   2.214 0.026835 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 3210.2  on 2968  degrees of freedom
## AIC: 3274.2
## 
## Number of Fisher Scoring iterations: 5
        #confint(simple.logistics.all.significant)

        
#Removing job variable as it does not look statistically significant in model (Got lowest AIC 3270 here)
        simple.logistics.all1 <-glm(y~ contact + month + campaign + pdays + previous + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + euribor3m + nr.employed, family="binomial",data=bank.additional.sample.train)
        summary(simple.logistics.all1)
## 
## Call:
## glm(formula = y ~ contact + month + campaign + pdays + previous + 
##     poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + 
##     euribor3m + nr.employed, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7664  -0.8295  -0.6153   0.8030   1.9294  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -3.389e+02  8.668e+01  -3.910 9.23e-05 ***
## contacttelephone    -6.573e-01  1.679e-01  -3.914 9.06e-05 ***
## monthaug             7.622e-01  3.298e-01   2.311 0.020811 *  
## monthdec            -3.211e-01  5.328e-01  -0.603 0.546708    
## monthjul             7.599e-02  2.188e-01   0.347 0.728324    
## monthjun            -1.105e+00  3.038e-01  -3.637 0.000276 ***
## monthmar             1.565e+00  3.750e-01   4.174 3.00e-05 ***
## monthmay            -4.078e-01  1.864e-01  -2.187 0.028734 *  
## monthnov            -3.809e-01  2.652e-01  -1.436 0.151034    
## monthoct             1.435e-01  3.659e-01   0.392 0.695019    
## monthsep             9.011e-01  4.901e-01   1.839 0.065967 .  
## campaign            -2.071e-02  1.836e-02  -1.128 0.259254    
## pdays               -1.024e-03  8.350e-04  -1.226 0.220035    
## previous             8.477e-02  2.094e-01   0.405 0.685613    
## poutcomenonexistent  5.872e-01  2.750e-01   2.135 0.032753 *  
## poutcomesuccess      8.967e-01  8.324e-01   1.077 0.281411    
## emp.var.rate        -1.988e+00  3.409e-01  -5.832 5.46e-09 ***
## cons.price.idx       2.775e+00  5.839e-01   4.752 2.01e-06 ***
## cons.conf.idx        5.793e-02  2.068e-02   2.802 0.005082 ** 
## euribor3m            7.810e-02  3.047e-01   0.256 0.797719    
## nr.employed          1.586e-02  6.913e-03   2.294 0.021786 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 3214.1  on 2979  degrees of freedom
## AIC: 3256.1
## 
## Number of Fisher Scoring iterations: 5
        #confint(simple.logistics.all1)


#Removing month variable as it does not look practically significant 
        simple.logistics.all2 <-glm(y~ contact + campaign + pdays + previous + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + euribor3m + nr.employed, family="binomial",data=bank.additional.sample.train)
        summary(simple.logistics.all2)
## 
## Call:
## glm(formula = y ~ contact + campaign + pdays + previous + poutcome + 
##     emp.var.rate + cons.price.idx + cons.conf.idx + euribor3m + 
##     nr.employed, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6757  -0.8045  -0.5392   0.9808   2.0543  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -1.014e+02  3.388e+01  -2.994  0.00276 ** 
## contacttelephone    -8.871e-01  1.277e-01  -6.946 3.76e-12 ***
## campaign            -1.367e-02  1.800e-02  -0.760  0.44742    
## pdays               -9.384e-04  8.256e-04  -1.137  0.25571    
## previous             1.421e-01  2.081e-01   0.683  0.49478    
## poutcomenonexistent  8.014e-01  2.708e-01   2.960  0.00308 ** 
## poutcomesuccess      1.015e+00  8.271e-01   1.227  0.21983    
## emp.var.rate        -9.492e-01  1.573e-01  -6.036 1.58e-09 ***
## cons.price.idx       1.157e+00  2.250e-01   5.143 2.71e-07 ***
## cons.conf.idx        5.223e-02  1.324e-02   3.944 8.03e-05 ***
## euribor3m            1.930e-01  1.776e-01   1.086  0.27732    
## nr.employed         -1.009e-03  3.149e-03  -0.320  0.74875    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 3295.0  on 2988  degrees of freedom
## AIC: 3319
## 
## Number of Fisher Scoring iterations: 5
        #confint(simple.logistics.all2)

        
#Removing campaign variable as it does not look statistically significant in model
        simple.logistics.all3 <-glm(y~ contact +  pdays + previous + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + euribor3m + nr.employed, family="binomial",data=bank.additional.sample.train)
        summary(simple.logistics.all3)
## 
## Call:
## glm(formula = y ~ contact + pdays + previous + poutcome + emp.var.rate + 
##     cons.price.idx + cons.conf.idx + euribor3m + nr.employed, 
##     family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6760  -0.8024  -0.5763   0.9853   2.0259  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -1.005e+02  3.387e+01  -2.967  0.00301 ** 
## contacttelephone    -8.912e-01  1.276e-01  -6.987 2.81e-12 ***
## pdays               -9.409e-04  8.257e-04  -1.139  0.25451    
## previous             1.418e-01  2.080e-01   0.682  0.49523    
## poutcomenonexistent  8.011e-01  2.707e-01   2.960  0.00308 ** 
## poutcomesuccess      1.013e+00  8.272e-01   1.225  0.22061    
## emp.var.rate        -9.590e-01  1.568e-01  -6.116 9.61e-10 ***
## cons.price.idx       1.155e+00  2.250e-01   5.133 2.85e-07 ***
## cons.conf.idx        5.187e-02  1.324e-02   3.918 8.94e-05 ***
## euribor3m            2.060e-01  1.769e-01   1.164  0.24432    
## nr.employed         -1.171e-03  3.143e-03  -0.373  0.70948    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 3295.5  on 2989  degrees of freedom
## AIC: 3317.5
## 
## Number of Fisher Scoring iterations: 5
        #confint(simple.logistics.all3)

        
#Removing pdays variable as it does not look statistically significant in model
        simple.logistics.all4 <-glm(y~ contact + previous + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + euribor3m + nr.employed, family="binomial",data=bank.additional.sample.train)
        summary(simple.logistics.all4)
## 
## Call:
## glm(formula = y ~ contact + previous + poutcome + emp.var.rate + 
##     cons.price.idx + cons.conf.idx + euribor3m + nr.employed, 
##     family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6590  -0.8059  -0.5752   0.9858   2.0284  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -1.015e+02  3.387e+01  -2.998  0.00272 ** 
## contacttelephone    -8.951e-01  1.276e-01  -7.018 2.25e-12 ***
## previous             2.466e-01  1.948e-01   1.266  0.20554    
## poutcomenonexistent  8.970e-01  2.637e-01   3.401  0.00067 ***
## poutcomesuccess      1.885e+00  2.858e-01   6.595 4.25e-11 ***
## emp.var.rate        -9.647e-01  1.566e-01  -6.159 7.30e-10 ***
## cons.price.idx       1.160e+00  2.250e-01   5.157 2.51e-07 ***
## cons.conf.idx        5.198e-02  1.325e-02   3.924 8.71e-05 ***
## euribor3m            2.136e-01  1.769e-01   1.208  0.22708    
## nr.employed         -1.271e-03  3.145e-03  -0.404  0.68598    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 3297.1  on 2990  degrees of freedom
## AIC: 3317.1
## 
## Number of Fisher Scoring iterations: 5
        #confint(simple.logistics.all4)

        
#Removing previous variable as it does not look statistically significant in model
        simple.logistics.all5 <-glm(y~ contact + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + euribor3m + nr.employed, family="binomial",data=bank.additional.sample.train)
        summary(simple.logistics.all5)
## 
## Call:
## glm(formula = y ~ contact + poutcome + emp.var.rate + cons.price.idx + 
##     cons.conf.idx + euribor3m + nr.employed, family = "binomial", 
##     data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7025  -0.8173  -0.5716   0.9871   2.0373  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -1.009e+02  3.388e+01  -2.979  0.00289 ** 
## contacttelephone    -9.103e-01  1.272e-01  -7.154 8.42e-13 ***
## poutcomenonexistent  6.087e-01  1.361e-01   4.474 7.67e-06 ***
## poutcomesuccess      1.942e+00  2.828e-01   6.866 6.58e-12 ***
## emp.var.rate        -9.652e-01  1.568e-01  -6.157 7.40e-10 ***
## cons.price.idx       1.182e+00  2.247e-01   5.258 1.46e-07 ***
## cons.conf.idx        5.246e-02  1.323e-02   3.965 7.35e-05 ***
## euribor3m            2.263e-01  1.765e-01   1.282  0.19992    
## nr.employed         -1.723e-03  3.128e-03  -0.551  0.58182    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 3298.8  on 2991  degrees of freedom
## AIC: 3316.8
## 
## Number of Fisher Scoring iterations: 5
        #confint(simple.logistics.all5)

        
#Removing nr.employed variable as it does not look statistically significant in model and it looked like related to emp.var.rate
        simple.logistics.all6 <-glm(y~ contact + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + euribor3m, family="binomial",data=bank.additional.sample.train)
        summary(simple.logistics.all6)
## 
## Call:
## glm(formula = y ~ contact + poutcome + emp.var.rate + cons.price.idx + 
##     cons.conf.idx + euribor3m, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7033  -0.8110  -0.5697   0.9841   2.0445  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -1.178e+02  1.465e+01  -8.039 9.08e-16 ***
## contacttelephone    -9.235e-01  1.250e-01  -7.386 1.51e-13 ***
## poutcomenonexistent  6.076e-01  1.361e-01   4.465 8.01e-06 ***
## poutcomesuccess      1.948e+00  2.826e-01   6.893 5.44e-12 ***
## emp.var.rate        -9.857e-01  1.526e-01  -6.461 1.04e-10 ***
## cons.price.idx       1.271e+00  1.545e-01   8.228  < 2e-16 ***
## cons.conf.idx        5.754e-02  9.497e-03   6.059 1.37e-09 ***
## euribor3m            1.537e-01  1.178e-01   1.304    0.192    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 3299.1  on 2992  degrees of freedom
## AIC: 3315.1
## 
## Number of Fisher Scoring iterations: 5
        #confint(simple.logistics.all6)

        
#Removing euribor3m variable as it does not look statistically significant in model and it looked like related to emp.var.rate 
        simple.logistics<-glm(y~ contact + poutcome + emp.var.rate + cons.conf.idx + cons.price.idx , family="binomial",data=bank.additional.sample.train)
        summary(simple.logistics)
## 
## Call:
## glm(formula = y ~ contact + poutcome + emp.var.rate + cons.conf.idx + 
##     cons.price.idx, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6698  -0.7981  -0.5786   0.9835   1.9998  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -1.068e+02  1.200e+01  -8.903  < 2e-16 ***
## contacttelephone    -8.733e-01  1.189e-01  -7.346 2.04e-13 ***
## poutcomenonexistent  6.184e-01  1.360e-01   4.548 5.42e-06 ***
## poutcomesuccess      1.931e+00  2.822e-01   6.842 7.79e-12 ***
## emp.var.rate        -7.967e-01  4.555e-02 -17.490  < 2e-16 ***
## cons.conf.idx        6.046e-02  9.209e-03   6.565 5.19e-11 ***
## cons.price.idx       1.161e+00  1.294e-01   8.979  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 3300.8  on 2993  degrees of freedom
## AIC: 3314.8
## 
## Number of Fisher Scoring iterations: 5
        confint(simple.logistics)
## Waiting for profiling to be done...
##                             2.5 %       97.5 %
## (Intercept)         -130.79487834 -83.71427806
## contacttelephone      -1.10795893  -0.64171154
## poutcomenonexistent    0.35273669   0.88611246
## poutcomesuccess        1.40193296   2.51426260
## emp.var.rate          -0.88739499  -0.70873877
## cons.conf.idx          0.04258315   0.07870058
## cons.price.idx         0.91230508   1.41981182

Since our goal is to predict the outcome, removing the duration variable from stepwise full model even though it looked significant in the full model. default is considered in stepwise model. However, we do not want to consider it because considering unknown status as predictor would be risky. And defaultno is not significant. Practically I do not believe customer response depends on which month they were contacted (could be just a coincidence). But for now keeping it in the model.

Finalizing the below equation as it had lowest AIC among Forward, Backward and Stepwise model. Backward and Stepwise has exactly same set of predictor variables.

y ~ age + default + contact + month + day_of_week + campaign + pdays + previous + emp.var.rate + cons.price.idx + cons.conf.idx

#Suchi
#Type of Selection : Stepwise, Forward, Backward


# First Fit the full model (Excluding duration variable)
        full.log.model<-glm(y~.,family="binomial",data=bank.additional.sample.train[-11])

# Stepwise model
      step.model <- stepAIC(full.log.model, direction = "both",  trace = FALSE)
      summary(step.model)
## 
## Call:
## glm(formula = y ~ housing + contact + month + day_of_week + pdays + 
##     poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + 
##     nr.employed, family = "binomial", data = bank.additional.sample.train[-11])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7924  -0.8471  -0.5624   0.7967   1.9671  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -3.512e+02  7.392e+01  -4.752 2.02e-06 ***
## housingunknown      -5.957e-01  2.981e-01  -1.999 0.045661 *  
## housingyes          -1.858e-01  8.868e-02  -2.095 0.036127 *  
## contacttelephone    -6.804e-01  1.681e-01  -4.047 5.19e-05 ***
## monthaug             7.722e-01  3.322e-01   2.325 0.020096 *  
## monthdec            -3.266e-01  5.235e-01  -0.624 0.532665    
## monthjul             7.834e-02  2.195e-01   0.357 0.721181    
## monthjun            -1.116e+00  2.947e-01  -3.787 0.000153 ***
## monthmar             1.627e+00  3.570e-01   4.557 5.20e-06 ***
## monthmay            -3.861e-01  1.865e-01  -2.071 0.038403 *  
## monthnov            -3.376e-01  2.171e-01  -1.555 0.119908    
## monthoct             1.780e-01  3.188e-01   0.558 0.576521    
## monthsep             9.149e-01  4.778e-01   1.915 0.055500 .  
## day_of_weekmon      -3.421e-01  1.424e-01  -2.403 0.016270 *  
## day_of_weekthu       1.604e-01  1.378e-01   1.164 0.244428    
## day_of_weektue       9.308e-02  1.414e-01   0.658 0.510471    
## day_of_weekwed       7.870e-02  1.382e-01   0.569 0.569116    
## pdays               -1.159e-03  7.874e-04  -1.472 0.140889    
## poutcomenonexistent  5.134e-01  1.440e-01   3.566 0.000363 ***
## poutcomesuccess      7.860e-01  8.078e-01   0.973 0.330524    
## emp.var.rate        -1.988e+00  3.377e-01  -5.888 3.91e-09 ***
## cons.price.idx       2.849e+00  5.387e-01   5.289 1.23e-07 ***
## cons.conf.idx        6.266e-02  1.349e-02   4.644 3.42e-06 ***
## nr.employed          1.704e-02  4.739e-03   3.594 0.000325 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 3192.4  on 2976  degrees of freedom
## AIC: 3240.4
## 
## Number of Fisher Scoring iterations: 5
      AIC(step.model)
## [1] 3240.412
      exp(cbind("Odds ratio" = coef(step.model), confint.default(step.model, level = 0.95)))
##                        Odds ratio         2.5 %       97.5 %
## (Intercept)         2.897335e-153 3.499620e-216 2.398704e-90
## housingunknown       5.511613e-01  3.072894e-01 9.885755e-01
## housingyes           8.304239e-01  6.979415e-01 9.880539e-01
## contacttelephone     5.064283e-01  3.642625e-01 7.040793e-01
## monthaug             2.164542e+00  1.128753e+00 4.150813e+00
## monthdec             7.213405e-01  2.585371e-01 2.012601e+00
## monthjul             1.081489e+00  7.033559e-01 1.662912e+00
## monthjun             3.276301e-01  1.838895e-01 5.837283e-01
## monthmar             5.087925e+00  2.527142e+00 1.024358e+01
## monthmay             6.796717e-01  4.715780e-01 9.795912e-01
## monthnov             7.134540e-01  4.661905e-01 1.091864e+00
## monthoct             1.194868e+00  6.396850e-01 2.231893e+00
## monthsep             2.496643e+00  9.787197e-01 6.368754e+00
## day_of_weekmon       7.102951e-01  5.373498e-01 9.389026e-01
## day_of_weekthu       1.173992e+00  8.961087e-01 1.538048e+00
## day_of_weektue       1.097545e+00  8.318350e-01 1.448129e+00
## day_of_weekwed       1.081880e+00  8.251246e-01 1.418531e+00
## pdays                9.988412e-01  9.973008e-01 1.000384e+00
## poutcomenonexistent  1.670974e+00  1.260140e+00 2.215751e+00
## poutcomesuccess      2.194611e+00  4.505936e-01 1.068882e+01
## emp.var.rate         1.369154e-01  7.063030e-02 2.654076e-01
## cons.price.idx       1.727765e+01  6.010540e+00 4.966561e+01
## cons.conf.idx        1.064666e+00  1.036878e+00 1.093198e+00
## nr.employed          1.017181e+00  1.007776e+00 1.026674e+00
      vif(step.model)  
##                      GVIF Df GVIF^(1/(2*Df))
## housing          1.026534  2        1.006568
## contact          3.067376  1        1.751393
## month           44.211482  9        1.234294
## day_of_week      1.050005  4        1.006118
## pdays           10.030446  1        3.167088
## poutcome        11.923293  2        1.858228
## emp.var.rate   163.142930  1       12.772742
## cons.price.idx  53.864403  1        7.339237
## cons.conf.idx    2.335350  1        1.528185
## nr.employed     73.248971  1        8.558561
# forward regression model
      foward.model <- stepAIC(full.log.model, direction = "forward", trace = FALSE)
      AIC(foward.model)
## [1] 3276.491
      summary(foward.model)
## 
## Call:
## glm(formula = y ~ age + job + marital + education + default + 
##     housing + loan + contact + month + day_of_week + campaign + 
##     pdays + previous + poutcome + emp.var.rate + cons.price.idx + 
##     cons.conf.idx + euribor3m + nr.employed, family = "binomial", 
##     data = bank.additional.sample.train[-11])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7426  -0.8526  -0.5169   0.7791   1.9976  
## 
## Coefficients: (1 not defined because of singularities)
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -3.255e+02  8.812e+01  -3.694 0.000221 ***
## age                          -2.827e-03  5.364e-03  -0.527 0.598110    
## jobblue-collar                1.936e-01  1.656e-01   1.169 0.242272    
## jobentrepreneur               4.790e-02  2.562e-01   0.187 0.851692    
## jobhousemaid                  2.028e-01  3.088e-01   0.657 0.511378    
## jobmanagement                -1.241e-01  1.852e-01  -0.670 0.502542    
## jobretired                    3.352e-01  2.579e-01   1.300 0.193764    
## jobself-employed              1.303e-01  2.675e-01   0.487 0.626233    
## jobservices                   2.295e-01  1.816e-01   1.264 0.206383    
## jobstudent                    1.812e-01  2.865e-01   0.632 0.527119    
## jobtechnician                -4.346e-02  1.541e-01  -0.282 0.777878    
## jobunemployed                 3.703e-01  2.931e-01   1.264 0.206389    
## jobunknown                   -7.066e-02  4.999e-01  -0.141 0.887588    
## maritalmarried                1.892e-01  1.449e-01   1.306 0.191436    
## maritalsingle                 1.085e-01  1.662e-01   0.653 0.513944    
## maritalunknown                5.728e-01  9.439e-01   0.607 0.543907    
## educationbasic.6y            -1.761e-01  2.420e-01  -0.728 0.466658    
## educationbasic.9y            -1.378e-01  1.948e-01  -0.707 0.479377    
## educationhigh.school          3.488e-02  1.945e-01   0.179 0.857658    
## educationilliterate           1.298e+01  2.641e+02   0.049 0.960817    
## educationprofessional.course  3.823e-02  2.175e-01   0.176 0.860467    
## educationuniversity.degree    2.333e-01  1.981e-01   1.178 0.238819    
## educationunknown              1.316e-01  2.586e-01   0.509 0.610940    
## defaultunknown               -1.383e-01  1.264e-01  -1.094 0.273960    
## housingunknown               -5.528e-01  3.018e-01  -1.832 0.066964 .  
## housingyes                   -1.863e-01  8.947e-02  -2.082 0.037358 *  
## loanunknown                          NA         NA      NA       NA    
## loanyes                       3.657e-02  1.212e-01   0.302 0.762963    
## contacttelephone             -6.546e-01  1.696e-01  -3.859 0.000114 ***
## monthaug                      7.271e-01  3.361e-01   2.164 0.030496 *  
## monthdec                     -3.786e-01  5.391e-01  -0.702 0.482478    
## monthjul                      1.053e-01  2.226e-01   0.473 0.636164    
## monthjun                     -1.046e+00  3.099e-01  -3.375 0.000738 ***
## monthmar                      1.594e+00  3.797e-01   4.197 2.70e-05 ***
## monthmay                     -3.690e-01  1.911e-01  -1.931 0.053508 .  
## monthnov                     -3.915e-01  2.687e-01  -1.457 0.145151    
## monthoct                      1.187e-01  3.706e-01   0.320 0.748747    
## monthsep                      8.660e-01  4.946e-01   1.751 0.079961 .  
## day_of_weekmon               -3.535e-01  1.434e-01  -2.466 0.013679 *  
## day_of_weekthu                1.449e-01  1.390e-01   1.043 0.296987    
## day_of_weektue                8.182e-02  1.429e-01   0.573 0.566843    
## day_of_weekwed                6.741e-02  1.396e-01   0.483 0.629126    
## campaign                     -1.860e-02  1.848e-02  -1.006 0.314334    
## pdays                        -1.030e-03  8.419e-04  -1.224 0.221133    
## previous                      8.111e-02  2.106e-01   0.385 0.700062    
## poutcomenonexistent           5.676e-01  2.768e-01   2.051 0.040277 *  
## poutcomesuccess               8.656e-01  8.383e-01   1.032 0.301843    
## emp.var.rate                 -1.943e+00  3.450e-01  -5.630 1.80e-08 ***
## cons.price.idx                2.682e+00  5.935e-01   4.519 6.21e-06 ***
## cons.conf.idx                 5.701e-02  2.100e-02   2.715 0.006624 ** 
## euribor3m                     9.952e-02  3.091e-01   0.322 0.747487    
## nr.employed                   1.491e-02  7.028e-03   2.122 0.033838 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 3174.5  on 2949  degrees of freedom
## AIC: 3276.5
## 
## Number of Fisher Scoring iterations: 12
# backward regression model
      backward.model <- stepAIC(full.log.model, direction = "backward", trace = FALSE)
      AIC(backward.model)
## [1] 3240.412
      summary(backward.model)
## 
## Call:
## glm(formula = y ~ housing + contact + month + day_of_week + pdays + 
##     poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + 
##     nr.employed, family = "binomial", data = bank.additional.sample.train[-11])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7924  -0.8471  -0.5624   0.7967   1.9671  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -3.512e+02  7.392e+01  -4.752 2.02e-06 ***
## housingunknown      -5.957e-01  2.981e-01  -1.999 0.045661 *  
## housingyes          -1.858e-01  8.868e-02  -2.095 0.036127 *  
## contacttelephone    -6.804e-01  1.681e-01  -4.047 5.19e-05 ***
## monthaug             7.722e-01  3.322e-01   2.325 0.020096 *  
## monthdec            -3.266e-01  5.235e-01  -0.624 0.532665    
## monthjul             7.834e-02  2.195e-01   0.357 0.721181    
## monthjun            -1.116e+00  2.947e-01  -3.787 0.000153 ***
## monthmar             1.627e+00  3.570e-01   4.557 5.20e-06 ***
## monthmay            -3.861e-01  1.865e-01  -2.071 0.038403 *  
## monthnov            -3.376e-01  2.171e-01  -1.555 0.119908    
## monthoct             1.780e-01  3.188e-01   0.558 0.576521    
## monthsep             9.149e-01  4.778e-01   1.915 0.055500 .  
## day_of_weekmon      -3.421e-01  1.424e-01  -2.403 0.016270 *  
## day_of_weekthu       1.604e-01  1.378e-01   1.164 0.244428    
## day_of_weektue       9.308e-02  1.414e-01   0.658 0.510471    
## day_of_weekwed       7.870e-02  1.382e-01   0.569 0.569116    
## pdays               -1.159e-03  7.874e-04  -1.472 0.140889    
## poutcomenonexistent  5.134e-01  1.440e-01   3.566 0.000363 ***
## poutcomesuccess      7.860e-01  8.078e-01   0.973 0.330524    
## emp.var.rate        -1.988e+00  3.377e-01  -5.888 3.91e-09 ***
## cons.price.idx       2.849e+00  5.387e-01   5.289 1.23e-07 ***
## cons.conf.idx        6.266e-02  1.349e-02   4.644 3.42e-06 ***
## nr.employed          1.704e-02  4.739e-03   3.594 0.000325 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 3192.4  on 2976  degrees of freedom
## AIC: 3240.4
## 
## Number of Fisher Scoring iterations: 5
#fitting a simple model by using only 4 predictors which appeared to be significant in EDA and p-values from previous step (Hollie)
        simple.logistics.trained<-glm(y~ euribor3m + cons.price.idx + poutcome + cons.conf.idx, family="binomial",data=bank.additional.sample.train)
        summary(simple.logistics.trained)
## 
## Call:
## glm(formula = y ~ euribor3m + cons.price.idx + poutcome + cons.conf.idx, 
##     family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5747  -0.8565  -0.6697   0.9747   1.8015  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -25.517362   8.533832  -2.990  0.00279 ** 
## euribor3m            -0.581088   0.034205 -16.988  < 2e-16 ***
## cons.price.idx        0.309276   0.092618   3.339  0.00084 ***
## poutcomenonexistent   0.521974   0.133147   3.920 8.84e-05 ***
## poutcomesuccess       2.049307   0.279601   7.329 2.31e-13 ***
## cons.conf.idx         0.054159   0.008641   6.268 3.66e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 3374.7  on 2994  degrees of freedom
## AIC: 3386.7
## 
## Number of Fisher Scoring iterations: 5
        confint(simple.logistics.trained)      
## Waiting for profiling to be done...
##                            2.5 %      97.5 %
## (Intercept)         -42.39522044 -8.92090766
## euribor3m            -0.64907943 -0.51492579
## cons.price.idx        0.12923340  0.49253047
## poutcomenonexistent   0.26144451  0.78365901
## poutcomesuccess       1.52626861  2.62844159
## cons.conf.idx         0.03735675  0.07124337
# Performance comparison for simple logistic model (Kenny)

        #simple.logistics.trained<-glm(y~ euribor3m + cons.price.idx + poutcome + cons.conf.idx, family="binomial",data=bank.additional.sample.train)
        fit.simple.logistics.pred<-predict(step.model, newdata = bank.additional.sample.test, type = "response")
 
#Suchi (removing 1 record from test where factor default )

#fit.simple.logistics.pred<-predict(step.model, newdata = bank.additional.sample.test, type = "response")
        
        cutoff<-0.5
        class.simple.logistics<-factor(ifelse(fit.simple.logistics.pred>cutoff,"yes","no"),levels=c("no","yes"))

        conf.simple.logistics<-confusionMatrix(table(class.simple.logistics,bank.additional.sample.test$y), positive = 'yes')
        conf.simple.logistics
## Confusion Matrix and Statistics
## 
##                       
## class.simple.logistics  no yes
##                    no  411 208
##                    yes  76 305
##                                           
##                Accuracy : 0.716           
##                  95% CI : (0.6869, 0.7438)
##     No Information Rate : 0.513           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4355          
##                                           
##  Mcnemar's Test P-Value : 7.639e-15       
##                                           
##             Sensitivity : 0.5945          
##             Specificity : 0.8439          
##          Pos Pred Value : 0.8005          
##          Neg Pred Value : 0.6640          
##              Prevalence : 0.5130          
##          Detection Rate : 0.3050          
##    Detection Prevalence : 0.3810          
##       Balanced Accuracy : 0.7192          
##                                           
##        'Positive' Class : yes             
## 
        results.simple.logistics<-prediction(fit.simple.logistics.pred,bank.additional.sample.test$y,label.ordering=c("no","yes"))
        roc.simple.logistics <- performance(results.simple.logistics, measure = "tpr", x.measure = "fpr")
        plot(roc.simple.logistics,colorize = TRUE)
        abline(a=0, b=1)

Removing duration variable here before trying LASSO to compare the performance with stepwise

# LASSO (Suchi)
        dat.train.x <- model.matrix(y ~ .,bank.additional.sample.train[-11])
        dat.train.y<-bank.additional.sample.train[,21]
        cvfit <- cv.glmnet(dat.train.x, dat.train.y, family = "binomial", type.measure = "class", nlambda = 1000)
        plot(cvfit)

        coef(cvfit, s = "lambda.min")
## 54 x 1 sparse Matrix of class "dgCMatrix"
##                                          1
## (Intercept)                  -78.261787018
## (Intercept)                    .          
## age                           -0.001637056
## jobblue-collar                 0.087205824
## jobentrepreneur                .          
## jobhousemaid                   0.061354472
## jobmanagement                 -0.129059410
## jobretired                     0.243481422
## jobself-employed               0.068456428
## jobservices                    0.143912659
## jobstudent                     0.150250085
## jobtechnician                 -0.065976214
## jobunemployed                  0.267901150
## jobunknown                     .          
## maritalmarried                 0.113492457
## maritalsingle                  0.029054379
## maritalunknown                 0.293777788
## educationbasic.6y             -0.150084551
## educationbasic.9y             -0.117252962
## educationhigh.school           .          
## educationilliterate            2.737057683
## educationprofessional.course   .          
## educationuniversity.degree     0.158908707
## educationunknown               0.080193552
## defaultunknown                -0.132160923
## defaultyes                     .          
## housingunknown                -0.406167726
## housingyes                    -0.169472352
## loanunknown                   -0.079018009
## loanyes                        0.005044221
## contacttelephone              -0.416196425
## monthaug                       .          
## monthdec                      -0.504244424
## monthjul                       0.076878962
## monthjun                      -0.297748659
## monthmar                       1.000946604
## monthmay                      -0.631007335
## monthnov                      -0.420164077
## monthoct                       0.149219311
## monthsep                       0.101023272
## day_of_weekmon                -0.373512545
## day_of_weekthu                 0.087494087
## day_of_weektue                 0.019468612
## day_of_weekwed                 0.005810514
## campaign                      -0.019773486
## pdays                         -0.001213301
## previous                       .          
## poutcomenonexistent            0.433379460
## poutcomesuccess                0.597784166
## emp.var.rate                  -0.725772662
## cons.price.idx                 0.866413405
## cons.conf.idx                  0.045285626
## euribor3m                      .          
## nr.employed                    .
        print("CV Error Rate:")
## [1] "CV Error Rate:"
        cvfit$cvm[which(cvfit$lambda==cvfit$lambda.min)]
## [1] 0.261
#Optimal penalty
        print("Penalty Value:")
## [1] "Penalty Value:"
        cvfit$lambda.min
## [1] 0.001412619
#For final model predictions go ahead and refit lasso using entire
#data set
        lasso.log.final<-glmnet(dat.train.x, dat.train.y, family = "binomial",lambda=cvfit$lambda.min)

        dat.test.x<-model.matrix(y ~ .,bank.additional.sample.test[-11])
        fit.pred.lasso <- predict(lasso.log.final, newx = dat.test.x, type = "response")
        fit.pred.step<-predict(step.model,newdata=bank.additional.sample.test,type="response")
#confusion matrix for LASSO (Suchi)

#Using cutoff of 0.5 to make the classification.
        cutoff<-0.5
        class.lasso<-factor(ifelse(fit.pred.lasso>cutoff,"yes","no"),levels=c("no","yes"))
        class.step<-factor(ifelse(fit.pred.step>cutoff,"yes","no"),levels=c("no","yes"))

#Confusion Matrix for Lasso
        conf.lasso<-table(class.lasso,bank.additional.sample.test$y)
        print("Confusion matrix for LASSO")
## [1] "Confusion matrix for LASSO"
        conf.lasso
##            
## class.lasso  no yes
##         no  406 204
##         yes  81 309
        conf.step<-table(class.step,bank.additional.sample.test$y)
        print("Confusion matrix for Stepwise")
## [1] "Confusion matrix for Stepwise"
        conf.step
##           
## class.step  no yes
##        no  411 208
##        yes  76 305
#Overall Accuracy of LASSO and Stepwise
        print("Overall accuracy for LASSO and Stepwise respectively")
## [1] "Overall accuracy for LASSO and Stepwise respectively"
        sum(diag(conf.lasso))/sum(conf.lasso)
## [1] 0.715
        sum(diag(conf.step))/sum(conf.step)
## [1] 0.716
# ROC plot for LASSO (Suchi)
        results.lasso<-prediction(fit.pred.lasso, bank.additional.sample.test$y,label.ordering=c("no","yes"))
        roc.lasso <- performance(results.lasso, measure = "tpr", x.measure = "fpr")
        plot(roc.lasso,colorize = TRUE)
        abline(a=0, b= 1)

Objective 2: You must include one additional logistic regression model which is also a more complicated logistic regression model than in Objective 1.

#Fitting a complex model using 4 variables and 1 interaction (Kenny)
        complex.interaction<-glm(y~ euribor3m + cons.price.idx + poutcome + cons.conf.idx + cons.price.idx*cons.conf.idx, family="binomial",data=bank.additional.sample.train)
        summary(complex.interaction)
## 
## Call:
## glm(formula = y ~ euribor3m + cons.price.idx + poutcome + cons.conf.idx + 
##     cons.price.idx * cons.conf.idx, family = "binomial", data = bank.additional.sample.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7452  -0.8067  -0.6823   1.0055   1.7825  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  254.48918   56.44127   4.509 6.52e-06 ***
## euribor3m                     -0.55084    0.03446 -15.986  < 2e-16 ***
## cons.price.idx                -2.70092    0.60665  -4.452 8.50e-06 ***
## poutcomenonexistent            0.53884    0.13494   3.993 6.52e-05 ***
## poutcomesuccess                1.98992    0.28157   7.067 1.58e-12 ***
## cons.conf.idx                  6.91006    1.37533   5.024 5.05e-07 ***
## cons.price.idx:cons.conf.idx  -0.07368    0.01478  -4.986 6.16e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4158.7  on 2999  degrees of freedom
## Residual deviance: 3347.4  on 2993  degrees of freedom
## AIC: 3361.4
## 
## Number of Fisher Scoring iterations: 5
        confint(complex.interaction)
## Waiting for profiling to be done...
##                                    2.5 %       97.5 %
## (Intercept)                  145.6665772 367.12728215
## euribor3m                     -0.6193081  -0.48416238
## cons.price.idx                -3.9113653  -1.53106520
## poutcomenonexistent            0.2750174   0.80429988
## poutcomesuccess                1.4625421   2.57243656
## cons.conf.idx                  4.2661557   9.66177624
## cons.price.idx:cons.conf.idx  -0.1032418  -0.04526933
#Fitting a complex model adding 1 new variable by categorizing euribor3m (Suchi)

#        bank.additional.sample.train$euribor3m_updated <- factor(ifelse(bank.additional.sample.train$euribor3m>median(bank.additional.sample.train$euribor3m),"High","Low"),levels=c("Low","High"))
 #       summary(glm(y~euribor3m_updated , family="binomial",data=bank.additional.sample.train))

 #       complex.logistics.grouping<-glm(y~ euribor3m_updated + age + contact + month + day_of_week + campaign + pdays + previous + emp.var.rate + cons.price.idx + cons.conf.idx , family="binomial",data=bank.additional.sample.train)
#        summary(complex.logistics.grouping)
#        confint(complex.logistics.grouping)
# Performance comparison for simple logistic model + interaction term (Kenny)

        complex.interaction.trained<-glm(y~ euribor3m + cons.price.idx + poutcome + cons.conf.idx + cons.price.idx*cons.conf.idx, family="binomial",data=bank.additional.sample.train)
        fit.complex.interaction.pred<-predict(complex.interaction.trained, newdata = bank.additional.sample.test, type = "response")
        
        cutoff<-0.5
        class.complex.interaction<-factor(ifelse(fit.complex.interaction.pred>cutoff,"yes","no"),levels=c("no","yes"))
  
  #      conf.complex.interaction<-table(class.complex.interaction,bank.additional.sample.test$y)
        conf.complex.interaction<-confusionMatrix(table(class.complex.interaction,bank.additional.sample.test$y), positive = 'yes')
        conf.complex.interaction
## Confusion Matrix and Statistics
## 
##                          
## class.complex.interaction  no yes
##                       no  377 197
##                       yes 110 316
##                                           
##                Accuracy : 0.693           
##                  95% CI : (0.6634, 0.7215)
##     No Information Rate : 0.513           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3884          
##                                           
##  Mcnemar's Test P-Value : 9.188e-07       
##                                           
##             Sensitivity : 0.6160          
##             Specificity : 0.7741          
##          Pos Pred Value : 0.7418          
##          Neg Pred Value : 0.6568          
##              Prevalence : 0.5130          
##          Detection Rate : 0.3160          
##    Detection Prevalence : 0.4260          
##       Balanced Accuracy : 0.6951          
##                                           
##        'Positive' Class : yes             
## 
  #      mean(class.complex.interaction==bank.additional.sample.test$y)
        
        results.complex.interaction<-prediction(fit.complex.interaction.pred,bank.additional.sample.test$y,label.ordering=c("no","yes"))
        roc.complex.interaction = performance(results.complex.interaction, measure = "tpr", x.measure = "fpr")
        plot(roc.complex.interaction,col = "blue")
        plot(roc.simple.logistics,col = "red",add=TRUE)
        plot(roc.lasso,col = "green",add=TRUE)

        legend("bottomright",legend=c("Simple","Added Interaction", "LASSO"),col=c("blue","red","green"),lty=1,lwd=1)
        abline(a=0, b=1)

Create another competing model using just the continuous predictors and use LDA or QDA.

#Objective 2 (Point 3) -- LDA on Sample data (Suchi: Objective 2: Point 3) # Did not use duration variable
        bank.additional.lda <- lda(y ~ ., bank.additional.sample.train[c(1,10:14,16:21)])
        bank.additional.lda.p <- predict(bank.additional.lda, bank.additional.sample.test)$class
        table.lda <- table(bank.additional.lda.p, bank.additional.sample.test$y)
        cm.lda = confusionMatrix(table.lda, positive = 'yes')
        cm.lda
## Confusion Matrix and Statistics
## 
##                      
## bank.additional.lda.p  no yes
##                   no  404 108
##                   yes  83 405
##                                           
##                Accuracy : 0.809           
##                  95% CI : (0.7832, 0.8329)
##     No Information Rate : 0.513           
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.6182          
##                                           
##  Mcnemar's Test P-Value : 0.08246         
##                                           
##             Sensitivity : 0.7895          
##             Specificity : 0.8296          
##          Pos Pred Value : 0.8299          
##          Neg Pred Value : 0.7891          
##              Prevalence : 0.5130          
##          Detection Rate : 0.4050          
##    Detection Prevalence : 0.4880          
##       Balanced Accuracy : 0.8095          
##                                           
##        'Positive' Class : yes             
## 

(Optional) Use a nonparameteric model approach as a competing model. Random forest or decision tree for predictors that are both categorical and continuous.

#Random Forest (Suchi: Optional) # Did not use duration variable
        bank.rf<- randomForest(y~., data = bank.additional.sample.train[-11], importance=TRUE)
        bank.rf
## 
## Call:
##  randomForest(formula = y ~ ., data = bank.additional.sample.train[-11],      importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 25.53%
## Confusion matrix:
##       no yes class.error
## no  1253 260   0.1718440
## yes  506 981   0.3402824
        varImpPlot(bank.rf)

        plot(bank.rf)

        bank.rf.pred<- predict(bank.rf, bank.additional.sample.test)
##Confusion Matrix
        table.rf <- table(bank.rf.pred,bank.additional.sample.test$y)
        cm.rf = confusionMatrix(table.rf, positive = 'yes')
        cm.rf
## Confusion Matrix and Statistics
## 
##             
## bank.rf.pred  no yes
##          no  409 199
##          yes  78 314
##                                           
##                Accuracy : 0.723           
##                  95% CI : (0.6941, 0.7505)
##     No Information Rate : 0.513           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4491          
##                                           
##  Mcnemar's Test P-Value : 5.591e-13       
##                                           
##             Sensitivity : 0.6121          
##             Specificity : 0.8398          
##          Pos Pred Value : 0.8010          
##          Neg Pred Value : 0.6727          
##              Prevalence : 0.5130          
##          Detection Rate : 0.3140          
##    Detection Prevalence : 0.3920          
##       Balanced Accuracy : 0.7260          
##                                           
##        'Positive' Class : yes             
## 

(Optional) Use a nonparameteric model approach as a competing model. k-nearest neighbors approach if just working with continuous predictors.

After running thru 50 iterations on same sample data set and running thru 100 iteration on different split dataset, we see that at K=32, the model has highest performance.

#KNN (Hollie)
set.seed(123)
splitPerc = .75

iterations = 100

numks = 50

masterAcc = matrix(nrow = iterations, ncol = numks)

for(j in 1:iterations)
{
  accs = data.frame(accuracy = numeric(50), k = numeric(50))
  trainIndices = sample(1:dim(bank.additional.sample)[1],round(splitPerc * dim(bank.additional.sample)[1]))
  train = bank.additional.sample[trainIndices,]
  test = bank.additional.sample[-trainIndices,]
  for(i in 1:numks)
  {
    classifications = knn(train[,c(17,19)],test[,c(17,19)],train$y, prob = TRUE, k = i)
    table(classifications,test$y)
    CM = confusionMatrix(table(classifications,test$y))
    masterAcc[j,i] = CM$overall[1]
  }
  
}
MeanAcc = colMeans(masterAcc)

plot(seq(1,numks,1),MeanAcc, type = "l")

## Loop for many k and one training / test partition

accs = data.frame(accuracy = numeric(50), k = numeric(50))

for(i in 1:50)
{
  classifications = knn(bank.additional.sample.train[,c(17,19)],bank.additional.sample.test[,c(17,19)],bank.additional.sample.train$y, prob = TRUE, k = i)
  table(bank.additional.sample.test$y,classifications)
  CM = confusionMatrix(table(bank.additional.sample.test$y,classifications),positive = 'yes')
  accs$accuracy[i] = CM$overall[1]
  accs$k[i] = i
}

plot(accs$k,accs$accuracy, type = "l", xlab = "k")

# k = 10 (optimal that we found)
classifications = knn(bank.additional.sample.train[,c(17,20)],bank.additional.sample.test[,c(17,20)],bank.additional.sample.train$y, prob = TRUE, k = 10)
table(bank.additional.sample.test$y,classifications)
##      classifications
##        no yes
##   no  416  71
##   yes 225 288
confusionMatrix(table(bank.additional.sample.test$y,classifications) ,positive = 'yes')
## Confusion Matrix and Statistics
## 
##      classifications
##        no yes
##   no  416  71
##   yes 225 288
##                                           
##                Accuracy : 0.704           
##                  95% CI : (0.6746, 0.7322)
##     No Information Rate : 0.641           
##     P-Value [Acc > NIR] : 1.475e-05       
##                                           
##                   Kappa : 0.4123          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8022          
##             Specificity : 0.6490          
##          Pos Pred Value : 0.5614          
##          Neg Pred Value : 0.8542          
##              Prevalence : 0.3590          
##          Detection Rate : 0.2880          
##    Detection Prevalence : 0.5130          
##       Balanced Accuracy : 0.7256          
##                                           
##        'Positive' Class : yes             
## 
cm.knn = confusionMatrix(table(bank.additional.sample.test$y,classifications),positive = 'yes')
cm.knn$overall[1]
## Accuracy 
##    0.704

Provide a summary table of the performance across the competing methods. Summarize the overall findings. A really great report will also give insight as to why the “best” model won out. This is where a thorough EDA will always help.

#Suchi
#Comparing metrics from all model

        print("Simple model Overall Accuracy:")
## [1] "Simple model Overall Accuracy:"
        conf.simple.logistics$overall[1]
## Accuracy 
##    0.716
        print("Complex model Overall Accuracy:")
## [1] "Complex model Overall Accuracy:"
        conf.complex.interaction$overall[1]
## Accuracy 
##    0.693
        print("Random Forest Overall Accuracy:")
## [1] "Random Forest Overall Accuracy:"
        cm.rf$overall[1]
## Accuracy 
##    0.723
        print("LDA Overall Accuracy:")
## [1] "LDA Overall Accuracy:"
        cm.lda$overall[1]
## Accuracy 
##    0.809
        print("KNN Overall Accuracy:")
## [1] "KNN Overall Accuracy:"
        cm.knn$overall[1]
## Accuracy 
##    0.704
        print("Simple model Misclassification rate:")
## [1] "Simple model Misclassification rate:"
        1-sum(diag(conf.simple.logistics$table))/sum(conf.simple.logistics$table)
## [1] 0.284
        print("Complex model Misclassification rate:")
## [1] "Complex model Misclassification rate:"
        1-sum(diag(conf.complex.interaction$table))/sum(conf.complex.interaction$table)
## [1] 0.307
        print("RandomForest Misclassification rate:")
## [1] "RandomForest Misclassification rate:"
        1-sum(diag(cm.rf$table))/sum(cm.rf$table)
## [1] 0.277
        print("LDA Misclassification rate:")
## [1] "LDA Misclassification rate:"
        1-sum(diag(cm.lda$table))/sum(cm.lda$table)
## [1] 0.191
        print("KNN Misclassification rate:")
## [1] "KNN Misclassification rate:"
        1-sum(diag(cm.knn$table))/sum(cm.knn$table)
## [1] 0.296